# Import all required libraries
import pandas as pd
import numpy as np
# Pandas display options: no row limit, at most 100 columns, 1000-character-wide output.
# (An earlier max_columns=None setting was immediately overridden by the value
# 100 below, so only the effective setting is kept.)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)
import matplotlib.cm as cm # Matplotlib and associated plotting modules
import matplotlib.colors as colors
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans # import k-means from clustering stage
from sklearn.metrics import silhouette_score
import folium # map rendering library
import json # library to handle JSON files
from pandas.io.json import json_normalize
from geopy.geocoders import Nominatim
import requests # library to handle requests
import webbrowser
import pgeocode
# Get the coordinates of Mumbai from the Nominatim geocoding service.
address = 'Mumbai'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
# latitude/longitude are reused as the centre of every map drawn later.
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Mumbai are {}, {}.'.format(latitude, longitude))
# Read the neighbourhood table from the Wikipedia page.
url = 'https://en.wikipedia.org/wiki/List_of_neighbourhoods_in_Mumbai'
# header=0 makes the first table row the column header explicitly, so the
# header text is never carried along as a data row and the result does not
# depend on whether read_html labels the columns 0..3 or by name.
df1 = pd.read_html(url, header=0)
type(df1)
df1
# Take the first four columns of the first table by position and give them
# fixed names; this replaces the per-column numpy/transpose/zip round-trip.
# NOTE(review): assumes the first table on the page is the neighbourhood list
# with columns in the order Area, Location, Latitude, Longitude - confirm.
df3 = df1[0].iloc[:, :4].copy()
df3.columns = ['Area', 'Location', 'Latitude', 'Longitude']
type(df3)
# The header row is already consumed by read_html, so no row has to be dropped.
MumbaiLatLan = df3.reset_index(drop=True)
MumbaiLatLan
# Plain base map of Mumbai centred on the geocoded latitude/longitude (no markers).
map_Mumbai = folium.Map(location=[latitude, longitude], zoom_start=11)
map_Mumbai
# Second map of Mumbai; this one receives one marker per neighbourhood.
map_mumbai = folium.Map(location=[latitude, longitude], zoom_start=11)
# Walk the neighbourhood table row by row as (latitude, longitude, area) tuples.
marker_rows = MumbaiLatLan[['Latitude', 'Longitude', 'Area']].itertuples(index=False, name=None)
for lat, lng, area_name in marker_rows:
    # The popup shows the area name when the marker is clicked.
    label = folium.Popup(area_name, parse_html=False)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=1,
        parse_html=False,
    )
    marker.add_to(map_mumbai)
map_mumbai
# Define Foursquare credentials and API version
#@hidden_cell
import os

# Credentials are taken from the environment when available so that real
# secrets never have to be written into the source; the original placeholder
# strings remain the fallback values.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')  # your Foursquare Secret
VERSION = '20180605'  # Foursquare API version
LIMIT = 100  # A default Foursquare API limit value
# Explore neighborhoods in Mumbai
def getNearbyVenues(names, latitudes, longitudes, radius=2500):
    """Collect venues around each neighborhood from the Foursquare explore API.

    One GET request is sent per (name, latitude, longitude) triple, using the
    module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT settings.

    Args:
        names: Iterable of neighborhood names.
        latitudes: Iterable of neighborhood latitudes, parallel to ``names``.
        longitudes: Iterable of neighborhood longitudes, parallel to ``names``.
        radius: Search radius passed to the API as the ``radius`` parameter.

    Returns:
        pandas.DataFrame with one row per venue and the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'.
        The frame has these columns even when no venue was found.
        'Venue Category' is None for a venue that lists no category.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the API does not answer within the timeout.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
        # make the GET request; the timeout keeps a stalled connection from
        # blocking the whole run, and raise_for_status surfaces API errors
        # (bad credentials, exceeded quota) as such instead of as a KeyError.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        groups = response.json().get('response', {}).get('groups') or []
        items = (groups[0].get('items') or []) if groups else []
        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # a venue may come without any category; do not crash on it
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))
    # Passing the column names to the constructor also works for zero rows,
    # where assigning .columns afterwards would raise a length mismatch.
    return pd.DataFrame(rows, columns=columns)
# Run getNearbyVenues on every neighborhood and collect the result in Mumbai_venues.
Mumbai_venues = getNearbyVenues(
    names=MumbaiLatLan['Area'],
    latitudes=MumbaiLatLan['Latitude'],
    longitudes=MumbaiLatLan['Longitude'],
)
print(Mumbai_venues.shape)
Mumbai_venues.head(5)
# Let's check how many venues were returned for each neighborhood
Mumbai_venues.groupby('Neighborhood').count()
# Let's find out how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(Mumbai_venues['Venue Category'].unique())))
# Keep only shopping malls and theaters for the analysis.
categories_of_interest = ['Shopping Mall', 'Movie Theater', 'Theater', 'Indie Movie Theater']
# .copy() yields an independent frame, so the assignment below does not write
# into a slice of Mumbai_venues (which triggers SettingWithCopyWarning).
Mumbai_venues_2 = Mumbai_venues[Mumbai_venues['Venue Category'].isin(categories_of_interest)].copy()
# Fold both movie-theater categories into the single 'Theater' category.
Mumbai_venues_2['Venue Category'] = Mumbai_venues_2['Venue Category'].replace(
    ['Movie Theater', 'Indie Movie Theater'], 'Theater')
Mumbai_venues_2
# 3. Analyze each neighborhood
# One-hot encode the venue category: one indicator column per category,
# named after the category itself (empty prefix and separator).
Mumbai_onehot = pd.get_dummies(Mumbai_venues_2[['Venue Category']], prefix="", prefix_sep="")
Mumbai_onehot
# Attach the neighborhood of each venue as a new (last) column.
Mumbai_onehot['Neighborhood'] = Mumbai_venues_2['Neighborhood']
# Move that last column to the front, keeping the other columns in order.
*leading_columns, neighborhood_column = Mumbai_onehot.columns
fixed_columns = [neighborhood_column] + leading_columns
Mumbai_onehot = Mumbai_onehot[fixed_columns]
Mumbai_onehot.head()
Mumbai_onehot.shape
# Aggregate per neighborhood in two ways:
#   Mumbai_grouped_1 - sum of the indicators (number of venues per category)
#   Mumbai_grouped_2 - mean of the indicators (share of each category)
venues_by_neighborhood = Mumbai_onehot.groupby('Neighborhood')
Mumbai_grouped_1 = venues_by_neighborhood.sum().reset_index()
Mumbai_grouped_2 = venues_by_neighborhood.mean().reset_index()
# Join both aggregates on the neighborhood; the category columns exist on both
# sides, so the sums end up with an _x suffix and the means with _y.
Mumbai_grouped = pd.merge(
    Mumbai_grouped_1,
    Mumbai_grouped_2,
    how='inner',
    left_on=['Neighborhood'],
    right_on=['Neighborhood'],
)
Mumbai_grouped
Mumbai_grouped.shape
Mumbai_grouped.head()
# 4. Cluster neighborhoods: find the best k
# The per-neighborhood category means are the clustering features. The axis
# is named explicitly because pandas 2.0 no longer accepts it positionally.
Mumbai_grouped_clustering = Mumbai_grouped_2.drop(columns='Neighborhood')
sil = []
kmax = 10
# dissimilarity would not be defined for a single cluster, thus, minimum number of clusters should be 2
# (range stops before kmax, so k runs from 2 to kmax - 1)
k_values = range(2, kmax)
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=1).fit(Mumbai_grouped_clustering)
    labels = kmeans.labels_
    sil.append(silhouette_score(Mumbai_grouped_clustering, labels, metric='euclidean'))
sil
# Plot the silhouette score of every candidate k; the x values are the same
# range that was used for fitting, so both stay in sync when kmax changes.
plt.plot(k_values, sil, 'bx-')
plt.xlabel('k')
plt.ylabel('sil')
plt.title('The Silhouette Method showing the optimal k')
plt.show()
# NOTE(review): the previous comment here said the chart suggests 6 clusters,
# while the code and the label mapping below use 5 - confirm which is intended.
# set number of clusters
kclusters = 5
# run k-means clustering on the per-neighborhood category means
kmeans = KMeans(n_clusters=kclusters, random_state=1).fit(Mumbai_grouped_clustering)
# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[:10]
# Mumbai_grouped_clustering
# Build a dataframe that holds, per neighborhood, its coordinates, its cluster
# and its shopping mall / theater figures.
# add clustering labels as the first column (raises if the column already exists)
Mumbai_grouped.insert(loc=0, column='Cluster Labels', value=kmeans.labels_)
# merge Mumbai_grouped with MumbaiLatLan to add latitude/longitude for each neighborhood
Mumbai_merged = pd.merge(
    MumbaiLatLan,
    Mumbai_grouped,
    how='inner',
    left_on=['Area'],
    right_on=['Neighborhood'],
)
Mumbai_merged.head()  # check the last columns!
# Human-readable description of each cluster id.
# NOTE(review): these texts are tied to the cluster ids of one particular
# k-means run - verify them whenever the input data or kclusters changes.
cluster_descriptions = {
    0: 'Only Theater',
    1: 'Only Shopping Mall',
    2: '66% Shopping Mall/33% Theater',
    3: '33% Shopping Mall/66% Theater',
    4: '75% Shopping Mall/25% Theater',
}
Mumbai_merged['Labels'] = Mumbai_merged['Cluster Labels'].map(cluster_descriptions)
Mumbai_merged
# Finally, let's visualize the resulting clusters
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# add one marker per neighborhood to the map
for lat, lon, poi, cluster, Labels, X1, X2 in zip(
        Mumbai_merged['Latitude'],
        Mumbai_merged['Longitude'],
        Mumbai_merged['Neighborhood'],
        Mumbai_merged['Cluster Labels'],
        Mumbai_merged['Labels'],
        Mumbai_merged['Shopping Mall_x'],
        Mumbai_merged['Theater_x']):
    # popup text: area name, cluster description and the two venue counts
    label = folium.Popup(
        '{} Area has-{}; Shopping Mall-{} Theater-{}'.format(poi, Labels, X1, X2),
        parse_html=True)
    # cluster - 1 sends cluster 0 to the last color (index -1 wraps around),
    # so every cluster still gets its own color
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters
# get Mumbai geojson data (pincode boundaries, loaded by folium from this URL)
mumbai_geo = r'https://raw.githubusercontent.com/datameet/PincodeBoundary/master/Mumbai/boundary.geojson' # geojson file
# get Mumbai area-wise density information
# NOTE(review): absolute path on one specific machine - adjust before running elsewhere.
Mumbai_den = pd.read_csv(r'C:\Users\Puneet\Desktop\Data_Casptone_project\Mumbai_density.csv')
Mumbai_den.head()
# generate choropleth map to visualize the population density
map_mumbai = folium.Map(location=[latitude, longitude], zoom_start=11)
# folium.Choropleth replaces the Map.choropleth() method, which is deprecated
# and no longer available in recent folium releases; the arguments are unchanged.
folium.Choropleth(
    geo_data=mumbai_geo,
    data=Mumbai_den,
    columns=['Area', 'Density_Square_Kilometer'],
    key_on='feature.properties.name',
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Mumbai Population Density per KM',
).add_to(map_mumbai)
# display map
map_mumbai
# Finally, let's visualize the resulting clusters and Mumbai city population density in a single map
# Reuse the density map: map_clusters is the same object as map_mumbai, so the
# markers below are drawn on top of the choropleth layer.
map_clusters = map_mumbai
# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# add one marker per neighborhood to the map
for lat, lon, poi, cluster, Labels, X1, X2 in zip(
        Mumbai_merged['Latitude'],
        Mumbai_merged['Longitude'],
        Mumbai_merged['Neighborhood'],
        Mumbai_merged['Cluster Labels'],
        Mumbai_merged['Labels'],
        Mumbai_merged['Shopping Mall_x'],
        Mumbai_merged['Theater_x']):
    # popup text: area name, cluster description and the two venue counts
    label = folium.Popup(
        '{} Area has-{}; Shopping Mall-{} Theater-{}'.format(poi, Labels, X1, X2),
        parse_html=True)
    # cluster - 1 sends cluster 0 to the last color (index -1 wraps around),
    # so every cluster still gets its own color
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters
#The End